library(ggplot2)
library(arules)
library(arulesViz)
library(dplyr)
library(plotly)
### Your working directory will be different.
setwd("/Users/aoliv01/Desktop/GradSchool/2018-2/DataMining/Homework/HW3")
getwd()
[1] "/Users/aoliv01/Desktop/GradSchool/2018-2/DataMining/Homework/HW3"
### Reading in the data
raw <- read.csv("bankdata_csv_all.csv")
str(raw)
'data.frame': 600 obs. of 12 variables:
$ id : Factor w/ 600 levels "ID12101","ID12102",..: 1 2 3 4 5 6 7 8 9 10 ...
$ age : int 48 40 51 23 57 57 22 58 37 54 ...
$ sex : Factor w/ 2 levels "FEMALE","MALE": 1 2 1 1 1 1 2 2 1 2 ...
$ region : Factor w/ 4 levels "INNER_CITY","RURAL",..: 1 4 1 4 2 4 2 4 3 4 ...
$ income : num 17546 30085 16575 20375 50576 ...
$ married : Factor w/ 2 levels "NO","YES": 1 2 2 2 2 2 1 2 2 2 ...
$ children : int 1 3 0 3 0 2 0 0 2 2 ...
$ car : Factor w/ 2 levels "NO","YES": 1 2 2 1 1 1 1 2 2 2 ...
$ save_act : Factor w/ 2 levels "NO","YES": 1 1 2 1 2 2 1 2 1 2 ...
$ current_act: Factor w/ 2 levels "NO","YES": 1 2 2 2 1 2 2 2 1 2 ...
$ mortgage : Factor w/ 2 levels "NO","YES": 1 2 1 1 1 1 1 1 1 1 ...
$ pep : Factor w/ 2 levels "NO","YES": 2 1 1 1 1 2 2 1 1 1 ...
attach(raw)
The following objects are masked from raw (pos = 3):
age, car, children, current_act, id, income, married, mortgage, pep, region,
save_act, sex
The following objects are masked from raw (pos = 6):
age, car, children, current_act, id, income, married, mortgage, pep, region,
save_act, sex
## Checking for null records
total_na <- sum(is.na(raw))
cat("Total na is: ", total_na)
Total na is: 0
## Removing the id column
raw <- raw[,-1]
## Quick boxplot to check for income outliers
income_box <- ggplot(raw, aes(x = pep, y = income)) +
geom_boxplot(fill = "steelblue", color = "darkorange") +
theme(plot.title = element_text(hjust = 0.5, color = "darkorange"))
income_box

## Box plot to check for age outliers or odd values
age_box <- ggplot(raw, aes(x = pep, y = age)) +
geom_boxplot(fill = "steelblue", color = "darkorange") +
theme(plot.title = element_text(hjust = 0.5, color = "darkorange"))
age_box

## Chart showing income and age of PEP customers and not PEP customers
ggplot(data = raw, aes(x = income, y = age)) +
geom_point(colour = "steelblue", alpha = .5) +
geom_smooth(method = "lm") +
facet_wrap(~ pep) +
ggtitle("PEP by Age and Income")

## Checking to see that age and income for PEP and non-PEP are comparable
## This will also show outliers
aggregate(income ~ pep, data = raw, median)
aggregate(age ~ pep, data = raw, median)
aggregate(income ~ pep, data = raw, max)
aggregate(age ~ pep, data = raw, max)
aggregate(income ~ pep, data = raw, min)
aggregate(age ~ pep, data = raw, min)
## Discretizing record data into transactional data
change <- c("id", "age", "income", "children")
disc <- raw[ , !(names(raw) %in% change)]
disc$age <- discretize(
raw$age,
method = "cluster",
labels = c("young", "middle", "older")
)
disc$income <- discretize(
raw$income,
method = "cluster",
labels = c("low", "medium", "high")
)
disc$children <- discretize(
raw$children,
method = "cluster",
labels = c("low", "medium", "many")
)
detach(raw)
summary(disc)
sex region married car save_act current_act mortgage pep
FEMALE:300 INNER_CITY:269 NO :204 NO :304 NO :186 NO :145 NO :391 NO :326
MALE :300 RURAL : 96 YES:396 YES:296 YES:414 YES:455 YES:209 YES:274
SUBURBAN : 62
TOWN :173
age income children
young :183 low :273 low :398
middle:226 medium:221 medium:134
older :191 high :106 many : 68
## Running apriori algorithm
rules <- apriori(disc, parameter = list(supp = 0.025, conf = 0.75, maxlen = 4))
Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen maxlen target ext
0.75 0.1 1 none FALSE TRUE 5 0.025 1 4 rules FALSE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 15
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[27 item(s), 600 transaction(s)] done [0.00s].
sorting and recoding items ... [27 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4
Mining stopped (maxlen reached). Only patterns up to a length of 4 returned!
done [0.00s].
writing ... [2293 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
rules<-sort(rules, by="confidence", decreasing=TRUE)
options(digits=2)
inspect(head(rules, 20))
lhs rhs support confidence lift count
[1] {income=high} => {save_act=YES} 0.177 1 1.4 106
[2] {save_act=NO,children=many} => {pep=NO} 0.037 1 1.8 22
[3] {region=RURAL,income=high} => {save_act=YES} 0.045 1 1.4 27
[4] {income=high,children=medium} => {save_act=YES} 0.047 1 1.4 28
[5] {current_act=NO,income=high} => {save_act=YES} 0.035 1 1.4 21
[6] {region=TOWN,income=high} => {save_act=YES} 0.038 1 1.4 23
[7] {age=older,income=high} => {save_act=YES} 0.150 1 1.4 90
[8] {married=NO,income=high} => {save_act=YES} 0.057 1 1.4 34
[9] {mortgage=YES,income=high} => {save_act=YES} 0.057 1 1.4 34
[10] {age=middle,income=high} => {save_act=YES} 0.027 1 1.4 16
[11] {region=INNER_CITY,income=high} => {save_act=YES} 0.073 1 1.4 44
[12] {pep=YES,income=high} => {save_act=YES} 0.105 1 1.4 63
[13] {car=YES,income=high} => {save_act=YES} 0.090 1 1.4 54
[14] {sex=MALE,income=high} => {save_act=YES} 0.088 1 1.4 53
[15] {sex=FEMALE,income=high} => {save_act=YES} 0.088 1 1.4 53
[16] {car=NO,income=high} => {save_act=YES} 0.087 1 1.4 52
[17] {pep=NO,income=high} => {save_act=YES} 0.072 1 1.4 43
[18] {mortgage=NO,income=high} => {save_act=YES} 0.120 1 1.4 72
[19] {married=YES,income=high} => {save_act=YES} 0.120 1 1.4 72
[20] {income=high,children=low} => {save_act=YES} 0.110 1 1.4 66
# Finding the highest ranked rules by support and confidence
# Writing a csv file of the rules
highest <- data.frame(
lhs = labels(lhs(rules)),
rhs = labels(rhs(rules)),
rules@quality)
highest <- highest[which(highest$support > mean(highest$support) & highest$confidence > mean(highest$confidence) & highest$lift > 1),]
highest <- highest[order(highest$confidence, highest$support),]
highest <- highest[1:25,]
write.csv(highest, file = "BankRuleSets.csv")
## The top rules for rhs = PEP
inspect(subset(rules, subset = support > mean(support) & confidence > mean(confidence) & lift > mean(lift) & rhs %pin% "pep=YES" ))
lhs rhs support confidence lift count
[1] {married=NO,mortgage=NO,children=low} => {pep=YES} 0.12 0.92 2 70
## All rules for rhs = PEP
inspect(subset(rules, subset = rhs %pin% "pep=YES" ))
lhs rhs support confidence lift count
[1] {married=NO,mortgage=NO,income=high} => {pep=YES} 0.040 0.96 2.1 24
[2] {married=NO,mortgage=NO,children=low} => {pep=YES} 0.117 0.92 2.0 70
[3] {age=older,income=high,children=medium} => {pep=YES} 0.037 0.92 2.0 22
[4] {income=high,children=medium} => {pep=YES} 0.042 0.89 2.0 25
[5] {save_act=YES,income=high,children=medium} => {pep=YES} 0.042 0.89 2.0 25
[6] {mortgage=NO,income=high,children=medium} => {pep=YES} 0.027 0.89 1.9 16
[7] {married=NO,save_act=NO,children=low} => {pep=YES} 0.065 0.89 1.9 39
[8] {sex=MALE,income=high,children=medium} => {pep=YES} 0.025 0.88 1.9 15
[9] {sex=MALE,married=NO,income=high} => {pep=YES} 0.025 0.88 1.9 15
[10] {current_act=YES,income=high,children=medium} => {pep=YES} 0.035 0.88 1.9 21
[11] {married=NO,mortgage=NO,age=older} => {pep=YES} 0.067 0.87 1.9 40
[12] {save_act=NO,mortgage=YES,children=low} => {pep=YES} 0.068 0.85 1.9 41
[13] {sex=MALE,married=NO,age=older} => {pep=YES} 0.028 0.85 1.9 17
[14] {married=NO,save_act=NO,age=older} => {pep=YES} 0.025 0.83 1.8 15
[15] {married=NO,income=high} => {pep=YES} 0.047 0.82 1.8 28
[16] {married=NO,save_act=YES,income=high} => {pep=YES} 0.047 0.82 1.8 28
[17] {married=NO,current_act=YES,income=high} => {pep=YES} 0.038 0.82 1.8 23
[18] {married=NO,age=older,income=high} => {pep=YES} 0.037 0.81 1.8 22
[19] {married=NO,age=older,children=low} => {pep=YES} 0.052 0.79 1.7 31
[20] {region=INNER_CITY,married=NO,age=older} => {pep=YES} 0.045 0.79 1.7 27
[21] {married=NO,car=NO,age=older} => {pep=YES} 0.038 0.79 1.7 23
[22] {region=INNER_CITY,married=NO,children=low} => {pep=YES} 0.080 0.79 1.7 48
[23] {married=NO,current_act=NO,children=low} => {pep=YES} 0.035 0.78 1.7 21
[24] {married=NO,mortgage=NO,income=medium} => {pep=YES} 0.068 0.77 1.7 41
[25] {sex=MALE,age=older,children=medium} => {pep=YES} 0.028 0.77 1.7 17
[26] {save_act=NO,age=older,children=low} => {pep=YES} 0.033 0.77 1.7 20
[27] {married=NO,income=medium,children=low} => {pep=YES} 0.072 0.77 1.7 43
[28] {car=YES,mortgage=NO,income=high} => {pep=YES} 0.045 0.75 1.6 27
## Formatting a new dataset to plot
ruledf = data.frame(
lhs = labels(lhs(rules)),
rhs = labels(rhs(rules)),
rules@quality)
ruledf$pep <- ifelse(ruledf$rhs == "{pep=YES}", 'YesPep', 'NoPep')
## Plotting the new dataset in an interactive 3D plot
p <- plot_ly(ruledf,
x = ~confidence,
y = ~lift,
z = ~support,
color = ~ruledf$pep,
colors = c('steelblue', 'darkorange'),
marker = list(size = 4, opacity = 0.35)
) %>%
add_markers() %>%
layout(scene = list(
xaxis = list(title = 'support'),
yaxis = list(title = 'confidence'),
zaxis = list(title = 'lift')
))
p
LS0tCnRpdGxlOiAiSG9tZXdvcmsgMyAtIE9saXZpZXJpIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCmBgYHtyfQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkoYXJ1bGVzKQpsaWJyYXJ5KGFydWxlc1ZpeikKbGlicmFyeShkcGx5cikKbGlicmFyeShwbG90bHkpCmBgYAoKYGBge3J9CiMjIyBZb3VyIHdvcmtpbmcgZGlyZWN0b3J5IHdpbGwgYmUgZGlmZmVyZW50LgpzZXR3ZCgiL1VzZXJzL2FvbGl2MDEvRGVza3RvcC9HcmFkU2Nob29sLzIwMTgtMi9EYXRhTWluaW5nL0hvbWV3b3JrL0hXMyIpCmdldHdkKCkKYGBgCgpgYGB7cn0KIyMjIFJlYWRpbmcgaW4gdGhlIGRhdGEKcmF3IDwtIHJlYWQuY3N2KCJiYW5rZGF0YV9jc3ZfYWxsLmNzdiIpCnN0cihyYXcpCmBgYApgYGB7cn0KYXR0YWNoKHJhdykKYGBgCgpgYGB7cn0KIyMgQ2hlY2tpbmcgZm9yIG51bGwgcmVjb3Jkcwp0b3RhbF9uYSA8LSBzdW0oaXMubmEocmF3KSkKY2F0KCJUb3RhbCBuYSBpczogIiwgdG90YWxfbmEpCmBgYAoKYGBge3J9CiMjIFJlbW92aW5nIHRoZSBpZCBjb2x1bW4KcmF3IDwtIHJhd1ssLTFdCmBgYAoKYGBge3J9CiMjIFF1aWNrIGJveHBsb3QgdG8gY2hlY2sgZm9yIGluY29tZSBvdXRsaWVycwppbmNvbWVfYm94IDwtIGdncGxvdChyYXcsIGFlcyh4ID0gcGVwLCB5ID0gaW5jb21lKSkgKyAKICBnZW9tX2JveHBsb3QoZmlsbCA9ICJzdGVlbGJsdWUiLCBjb2xvciA9ICJkYXJrb3JhbmdlIikgKwogIHRoZW1lKHBsb3QudGl0bGUgPSBlbGVtZW50X3RleHQoaGp1c3QgPSAwLjUsIGNvbG9yID0gImRhcmtvcmFuZ2UiKSkKCmluY29tZV9ib3gKYGBgCgpgYGB7cn0KIyMgQm94IHBsb3QgdG8gY2hlY2sgZm9yIGFnZSBvdXRsaWVycyBvciBvZGQgdmFsdWVzCmFnZV9ib3ggPC0gZ2dwbG90KHJhdywgYWVzKHggPSBwZXAsIHkgPSBhZ2UpKSArIAogIGdlb21fYm94cGxvdChmaWxsID0gInN0ZWVsYmx1ZSIsIGNvbG9yID0gImRhcmtvcmFuZ2UiKSArCiAgdGhlbWUocGxvdC50aXRsZSA9IGVsZW1lbnRfdGV4dChoanVzdCA9IDAuNSwgY29sb3IgPSAiZGFya29yYW5nZSIpKQphZ2VfYm94CmBgYAoKYGBge3J9CiMjIENoYXJ0IHNob3dpbmcgaW5jb21lIGFuZCBhZ2Ugb2YgUEVQIGN1c3RvbWVycyBhbmQgbm90IFBFUCBjdXN0b21lcnMKZ2dwbG90KGRhdGEgPSByYXcsIGFlcyh4ID0gaW5jb21lLCB5ID0gYWdlKSkgKyAKICBnZW9tX3BvaW50KGNvbG91ciA9ICJzdGVlbGJsdWUiLCBhbHBoYSA9IC41KSArIAogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIpICsKICBmYWNldF93cmFwKH4gcGVwKSArIAogIGdndGl0bGUoIlBFUCBieSBBZ2UgYW5kIEluY29tZSIpCmBgYAoKYGBge3J9CiMjIENoZWNraW5nIHRvIHNlZSB0aGF0IGFnZSBhbmQgaW5jb21lIGZvciBQRVAgYW5kIG5vbi1QRVAgYXJlIGNvbXBhcmFibGUKIyMgVGhpcyB3aWxsIGFsc28gc2hvdyBvdXRsaWVycwphZ2dyZWdhdGUoaW5jb21lIH4gcGVwLCBkYXRhID0gcmF3LCBtZWRpYW4pCmFnZ3JlZ2F0ZShhZ2UgfiBwZXAsIGRhdGEgPSByYXcsIG1lZGlhbikKYWdncmVnYXRlKGluY29tZSB+IHBlcCwgZGF0YSA9IHJhdywgbWF4KQphZ2dyZWdhdGUoYWdlIH4gcGVwLCBkYXRhID0gcmF3LCBtYXgpCmFnZ3JlZ2F0ZShpbmNvbWUgfiBwZXAsIGRhdGEgPSByYXcsIG1pbikKYWdncmVnYXRlKGFnZSB+IHBlcCwgZGF0YSA9IHJhdywgbWluKQpgYGAKCgpgYGB7cn0KIyMgRGlzY3JldGl6aW5nIHJlY29yZCBkYXRhIGludG8gdHJhbnNhY3Rpb25hbCBkYXRhCmNoYW5nZSA8LSBjKCJpZCIsICJhZ2UiLCAiaW5jb21lIiwgImNoaWxkcmVuIikKZGlzYyA8LSByYXdbICwgIShuYW1lcyhyYXcpICVpbiUgY2hhbmdlKV0KCmRpc2MkYWdlIDwtIGRpc2NyZXRpemUoCiAgcmF3JGFnZSwKICBtZXRob2QgPSAiY2x1c3RlciIsCiAgbGFiZWxzID0gYygieW91bmciLCAibWlkZGxlIiwgIm9sZGVyIikKICApCmRpc2MkaW5jb21lIDwtIGRpc2NyZXRpemUoCiAgICByYXckaW5jb21lLAogICAgbWV0aG9kID0gImNsdXN0ZXIiLAogICAgbGFiZWxzID0gYygibG93IiwgIm1lZGl1bSIsICJoaWdoIikKICApCmRpc2MkY2hpbGRyZW4gPC0gZGlzY3JldGl6ZSgKICAgIHJhdyRjaGlsZHJlbiwKICAgIG1ldGhvZCA9ICJjbHVzdGVyIiwKICAgIGxhYmVscyA9IGMoImxvdyIsICJtZWRpdW0iLCAibWFueSIpCiAgKQpkZXRhY2gocmF3KQpzdW1tYXJ5KGRpc2MpCmBgYAoKCmBgYHtyfQojIyBSdW5uaW5nIGFwcmlvcmkgYWxnb3JpdGhtCnJ1bGVzIDwtIGFwcmlvcmkoZGlzYywgcGFyYW1ldGVyID0gbGlzdChzdXBwID0gMC4wMjUsIGNvbmYgPSAwLjc1LCBtYXhsZW4gPSA0KSkKcnVsZXM8LXNvcnQocnVsZXMsIGJ5PSJjb25maWRlbmNlIiwgZGVjcmVhc2luZz1UUlVFKQpvcHRpb25zKGRpZ2l0cz0yKQppbnNwZWN0KGhlYWQocnVsZXMsIDIwKSkKYGBgCgpgYGB7cn0KIyBGaW5kaW5nIHRoZSBoaWdoZXN0IHJhbmtlZCBydWxlcyBieSBzdXBwb3J0IGFuZCBjb25maWRlbmNlCiMgV3JpdGluZyBhIGNzdiBmaWxlIG9mIHRoZSBydWxlcwpoaWdoZXN0IDwtIGRhdGEuZnJhbWUoCiAgICAgICBsaHMgPSBsYWJlbHMobGhzKHJ1bGVzKSksCiAgICAgICByaHMgPSBsYWJlbHMocmhzKHJ1bGVzKSksCiAgICAgIHJ1bGVzQHF1YWxpdHkpCmhpZ2hlc3QgPC0gaGlnaGVzdFt3aGljaChoaWdoZXN0JHN1cHBvcnQgPiBtZWFuKGhpZ2hlc3Qkc3VwcG9ydCkgJiBoaWdoZXN0JGNvbmZpZGVuY2UgPiBtZWFuKGhpZ2hlc3QkY29uZmlkZW5jZSkgJiBoaWdoZXN0JGxpZnQgPiAxKSxdCmhpZ2hlc3QgPC0gaGlnaGVzdFtvcmRlcihoaWdoZXN0JGNvbmZpZGVuY2UsIGhpZ2hlc3Qkc3VwcG9ydCksXQpoaWdoZXN0IDwtIGhpZ2hlc3RbMToyNSxdCndyaXRlLmNzdihoaWdoZXN0LCBmaWxlID0gIkJhbmtSdWxlU2V0cy5jc3YiKQpgYGAKCgpgYGB7cn0KIyMgVGhlIHRvcCBydWxlcyBmb3IgcmhzID0gUEVQCmluc3BlY3Qoc3Vic2V0KHJ1bGVzLCBzdWJzZXQgPSBzdXBwb3J0ID4gbWVhbihzdXBwb3J0KSAmIGNvbmZpZGVuY2UgPiBtZWFuKGNvbmZpZGVuY2UpICYgbGlmdCA+IG1lYW4obGlmdCkgJiAgcmhzICVwaW4lICJwZXA9WUVTIiApKQpgYGAKCmBgYHtyfQojIyBBbGwgcnVsZXMgZm9yIHJocyA9IFBFUAppbnNwZWN0KHN1YnNldChydWxlcywgc3Vic2V0ID0gcmhzICVwaW4lICJwZXA9WUVTIiApKQpgYGAKCmBgYHtyfQojIyBGb3JtYXR0aW5nIGEgbmV3IGRhdGFzZXQgdG8gcGxvdApydWxlZGYgPSBkYXRhLmZyYW1lKAogICAgICAgbGhzID0gbGFiZWxzKGxocyhydWxlcykpLAogICAgICAgcmhzID0gbGFiZWxzKHJocyhydWxlcykpLCAKICAgICAgIHJ1bGVzQHF1YWxpdHkpCnJ1bGVkZiRwZXAgPC0gaWZlbHNlKHJ1bGVkZiRyaHMgPT0gIntwZXA9WUVTfSIsICdZZXNQZXAnLCAnTm9QZXAnKQoKIyMgUGxvdHRpbmcgdGhlIG5ldyBkYXRhc2V0IGluIGFuIGludGVyYWN0aXZlIDNEIHBsb3QKcCA8LSBwbG90X2x5KHJ1bGVkZiwKICAgICAgICAgICAgICB4ID0gfmNvbmZpZGVuY2UsCiAgICAgICAgICAgICAgeSA9IH5saWZ0LAogICAgICAgICAgICAgIHogPSB+c3VwcG9ydCwKICAgICAgICAgICAgICBjb2xvciA9IH5ydWxlZGYkcGVwLAogICAgICAgICAgICAgIGNvbG9ycyA9IGMoJ3N0ZWVsYmx1ZScsICdkYXJrb3JhbmdlJyksCiAgICAgICAgICAgICAgbWFya2VyID0gbGlzdChzaXplID0gNCwgb3BhY2l0eSA9IDAuMzUpCiAgICAgICAgICAgICApICU+JSAKICAgIGFkZF9tYXJrZXJzKCkgJT4lCiAgICBsYXlvdXQoc2NlbmUgPSBsaXN0KAogICAgICAgICAgICAgICAgICAgeGF4aXMgPSBsaXN0KHRpdGxlID0gJ3N1cHBvcnQnKSwKICAgICAgICAgICAgICAgICAgIHlheGlzID0gbGlzdCh0aXRsZSA9ICdjb25maWRlbmNlJyksCiAgICAgICAgICAgICAgICAgICB6YXhpcyA9IGxpc3QodGl0bGUgPSAnbGlmdCcpCiAgICAgICAgICAgICAgICAgICApKQpwCmBgYAoKCg==